# NOTE(review): setwd() in scripts is fragile, and the read.csv() path below is
# already absolute, so this call is redundant for the visible code. Kept only
# for backward compatibility with any interactive use of relative paths.
setwd('/Users/wendyvu/Documents/PCA_txns/')
# Load the Sept/Oct 2018 transaction cohort export (one row per user).
df <- read.csv('/Users/wendyvu/Documents/PCA_txns/txn_pca_cohort_SeptOct2018_202002261339.csv',
               header = TRUE)
#library(data.table)
#df <- df[df[,3] %like% "2018-10",]
dim(df)
## [1] 163752 140
# Replace NA's with 0's across the whole frame.
# NOTE(review): this assumes NAs only occur in numeric columns; assigning 0
# into a factor column would produce NA with a warning — confirm on the export.
df[is.na(df)] <- 0
summary(df)
## id user_created
## Min. : 1 2018-09-01 09:26:25: 4
## 1st Qu.: 40939 2018-09-05 18:16:52: 3
## Median : 81876 2018-09-06 11:16:54: 3
## Mean : 81876 2018-09-09 18:01:11: 3
## 3rd Qu.:122814 2018-09-10 10:33:50: 3
## Max. :163752 2018-09-11 08:53:21: 3
## (Other) :163733
## kycc membership
## 2018-09-09 14:09:00: 5 premium : 10620
## 2018-09-25 06:51:45: 5 standard:153132
## 2018-09-25 07:55:18: 5
## 2018-09-25 13:26:22: 5
## 2018-09-25 16:28:24: 5
## 2018-09-10 10:52:31: 4
## (Other) :163723
## product_id market nationality
## BLACK_CARD_MONTHLY : 5924 FRA :50361 FRA :46582
## BUSINESS_BLACK : 1160 DEU :37913 ITA :36359
## BUSINESS_CARD : 13478 ITA :37142 DEU :22804
## FLEX_ACCOUNT_MONTHLY: 1323 RoE :17835 ESP :13328
## METAL_CARD_MONTHLY : 3536 ESP :15709 BRA : 4780
## STANDARD :138331 AUT : 4710 AUT : 3568
## (Other): 82 (Other):36331
## nat_status referral closed_at
## expat : 35018 friend_referral:32312 :141270
## native:128734 organic_search :28186 2019-07-30: 1492
## paid_search :22492 2019-01-10: 870
## app_store :22086 2019-08-25: 734
## :15637 2019-07-29: 626
## paid_social :13452 2019-01-24: 496
## (Other) :29587 (Other) : 18264
## kycc_closed_days gender age_group age
## Min. :-14.00 FEMALE: 47482 20-24 :37285 Min. : 20.00
## 1st Qu.: 0.00 MALE :116270 25-29 :35499 1st Qu.: 25.00
## Median : 0.00 30-34 :25225 Median : 31.00
## Mean : 35.18 35-39 :17463 Mean : 34.37
## 3rd Qu.: 0.00 40-44 :12616 3rd Qu.: 41.00
## Max. :542.00 45-49 :10202 Max. :120.00
## (Other):25462
## weeks_wau_txn mau_txn months_sau mau_act
## Min. :-417.00 Min. :-16.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 14.00 Median : 6.000 Median : 0.0000 Median : 4.000
## Mean : 20.59 Mean : 6.298 Mean : 0.6625 Mean : 4.843
## 3rd Qu.: 38.00 3rd Qu.: 11.000 3rd Qu.: 0.0000 3rd Qu.: 9.000
## Max. : 60.00 Max. : 12.000 Max. :15.0000 Max. :12.000
##
## n_pt_dom n_pt_intra n_pt_inter n_pt_ecomm
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 5.00 Median : 0.00 Median : 0.000 Median : 2.00
## Mean : 48.62 Mean : 12.86 Mean : 5.696 Mean : 16.01
## 3rd Qu.: 53.00 3rd Qu.: 8.00 3rd Qu.: 0.000 3rd Qu.: 15.00
## Max. :1525.00 Max. :1588.00 Max. :1514.000 Max. :4216.00
##
## n_pt_dom_atm n_pt_intra_atm n_pt_inter_atm n_dt
## Min. : 0.000 Min. : 0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.000 Median : 0.0000 Median : 0.000 Median : 0.000
## Mean : 6.314 Mean : 0.8071 Mean : 1.089 Mean : 5.963
## 3rd Qu.: 5.000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 4.000
## Max. :367.000 Max. :196.0000 Max. :416.000 Max. :1184.000
##
## n_dd n_ft n_cash26 n_ct
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 1.00
## Median : 0.000 Median : 0.000 Median : 0.0000 Median : 6.00
## Mean : 3.924 Mean : 1.263 Mean : 0.2392 Mean : 14.12
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.: 18.00
## Max. :3114.000 Max. :312.000 Max. :197.0000 Max. :4403.00
##
## n_wu n_space_ct n_space_dt
## Min. : 0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.0000 Median : 0.000 Median : 0.000
## Mean : 0.2922 Mean : 6.498 Mean : 6.498
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :108.0000 Max. :1292.000 Max. :1292.000
##
## pt_dom_sum pt_intra_sum pt_inter_sum
## Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 75.16 Median : 0.0 Median : 0.0
## Mean : 1142.74 Mean : 311.7 Mean : 191.8
## 3rd Qu.: 1106.82 3rd Qu.: 165.5 3rd Qu.: 0.0
## Max. :115460.67 Max. :103485.7 Max. :74898.3
##
## pt_ecomm_sum pt_dom_atm_sum pt_intra_atm_sum
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 38.94 Median : 10.0 Median : 0.00
## Mean : 806.31 Mean : 936.5 Mean : 99.54
## 3rd Qu.: 666.12 3rd Qu.: 450.0 3rd Qu.: 0.00
## Max. :158778.06 Max. :169590.0 Max. :96618.66
##
## pt_inter_atm_sum dt_sum dd_sum
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 163.4 Mean : 2602.6 Mean : 318.2
## 3rd Qu.: 0.0 3rd Qu.: 766.1 3rd Qu.: 0.0
## Max. :84347.5 Max. :2000000.0 Max. :179000.0
##
## ft_sum cash26_sum ct_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 4.1
## Median : 0.00 Median : 0.00 Median : 1301.9
## Mean : 95.78 Mean : 58.96 Mean : 7419.7
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 7135.0
## Max. :95530.00 Max. :91489.97 Max. :2202467.4
##
## wu_sum avg_primary_bal avg_spaces_bal
## Min. : 0.00 Min. : -11121.2 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.00 Median : 40.2 Median : 0.0
## Mean : 4.28 Mean : 598.3 Mean : 129.7
## 3rd Qu.: 0.00 3rd Qu.: 295.7 3rd Qu.: 0.0
## Max. :1620.00 Max. :1428571.4 Max. :136363.6
##
## n_spaces avg_space_ct avg_space_dt n_ext_out
## Min. : 0.0000 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0 1st Qu.: 0 1st Qu.: 1.0
## Median : 0.0000 Median : 0 Median : 0 Median : 27.0
## Mean : 0.4746 Mean : 1201 Mean : 1201 Mean : 102.7
## 3rd Qu.: 1.0000 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 134.0
## Max. :37.0000 Max. :8900000 Max. :8900000 Max. :4250.0
##
## n_ext_in ext_out_sum ext_in_sum grocery_market
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.: 1.00 1st Qu.: 9.6 1st Qu.: 10 1st Qu.: 0.00
## Median : 7.00 Median : 1184.8 Median : 1386 Median : 1.00
## Mean : 15.61 Mean : 6702.6 Mean : 7564 Mean : 16.33
## 3rd Qu.: 20.00 3rd Qu.: 6603.2 3rd Qu.: 7364 3rd Qu.: 14.00
## Max. :4407.00 Max. :2000000.0 Max. :2203650 Max. :1099.00
##
## restaurant atm fast_food local_transport
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000 Median : 0.000 Median : 0.000
## Mean : 9.094 Mean : 8.204 Mean : 5.715 Mean : 4.083
## 3rd Qu.: 7.000 3rd Qu.: 9.000 3rd Qu.: 5.000 3rd Qu.: 2.000
## Max. :690.000 Max. :417.000 Max. :525.000 Max. :419.000
##
## clothing retail household gas_service
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean : 3.668 Mean : 3.687 Mean : 3.635 Mean : 3.171
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.: 1.000
## Max. :833.000 Max. :1440.000 Max. :365.000 Max. :552.000
##
## food_drinks taxicabs drug_pharma bars_clubs
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean : 2.578 Mean : 2.242 Mean : 2.007 Mean : 1.979
## 3rd Qu.: 1.000 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :525.000 Max. :688.000 Max. :274.000 Max. :318.000
##
## car_toll_parking entertainment utilities subscriptions
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean : 1.762 Mean : 1.603 Mean : 1.506 Mean : 1.347
## 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.: 1.000 3rd Qu.: 0.000
## Max. :568.000 Max. :548.000 Max. :372.000 Max. :2431.000
##
## bookstores business_org_serv hotel_lodge computer_electronic
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 1.291 Mean : 1.414 Mean : 1.27 Mean : 1.228
## 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.: 1.00 3rd Qu.: 1.000
## Max. :851.000 Max. :731.000 Max. :130.00 Max. :409.000
##
## bakeries gambling_gaming record_stores
## Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.000 Median : 0.0000
## Mean : 1.233 Mean : 1.137 Mean : 0.9117
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :265.000 Max. :1186.000 Max. :350.0000
##
## digital_goods airline beauty_stores
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean : 0.8821 Mean : 0.7223 Mean : 0.6941
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :405.0000 Max. :241.0000 Max. :119.0000
##
## transport_serv fines_tax_gov money_financial
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean : 0.6354 Mean : 0.6428 Mean : 0.5297
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :229.0000 Max. :363.0000 Max. :288.0000
##
## professional_serv discount_stores travel_agencies
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean : 0.5188 Mean : 0.5192 Mean : 0.4319
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :843.0000 Max. :831.0000 Max. :157.0000
##
## computer_data_serv car_rental health_serv
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean : 0.4067 Mean : 0.3669 Mean : 0.2615
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :921.0000 Max. :431.0000 Max. :74.0000
##
## advertising_serv education dating_serv
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00000
## Median : 0.0000 Median : 0.0000 Median : 0.00000
## Mean : 0.2417 Mean : 0.1612 Mean : 0.04859
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.00000
## Max. :341.0000 Max. :251.0000 Max. :109.00000
##
## no_cat grocery_market_sum restaurant_sum atm_sum
## Min. : 0.000 Min. : 0.000 Min. : 0.0 Min. : 0
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0 1st Qu.: 0
## Median : 0.000 Median : 2.025 Median : 0.0 Median : 30
## Mean : 3.242 Mean : 266.700 Mean : 192.0 Mean : 1198
## 3rd Qu.: 2.000 3rd Qu.: 212.040 3rd Qu.: 146.9 3rd Qu.: 839
## Max. :1750.000 Max. :30770.920 Max. :27987.1 Max. :169590
##
## fast_food_sum local_transport_sum clothing_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0
## Median : 0.00 Median : 0.00 Median : 0.0
## Mean : 63.55 Mean : 72.98 Mean : 193.4
## 3rd Qu.: 51.86 3rd Qu.: 28.39 3rd Qu.: 139.9
## Max. :10683.87 Max. :34657.77 Max. :108255.7
##
## retail_sum household_sum gas_service_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 123.35 Mean : 158.30 Mean : 94.09
## 3rd Qu.: 86.42 3rd Qu.: 61.45 3rd Qu.: 24.91
## Max. :53229.67 Max. :46243.17 Max. :22330.86
##
## food_drinks_sum taxicabs_sum drug_pharma_sum bars_clubs_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 35.09 Mean : 27.46 Mean : 32.03 Mean : 36.92
## 3rd Qu.: 12.37 3rd Qu.: 0.00 3rd Qu.: 13.34 3rd Qu.: 5.99
## Max. :13616.95 Max. :9402.01 Max. :8536.29 Max. :41767.00
##
## car_toll_parking_sum entertainment_sum utilities_sum
## Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 0.0 Median : 0.0 Median : 0.00
## Mean : 12.3 Mean : 50.9 Mean : 41.35
## 3rd Qu.: 0.0 3rd Qu.: 22.5 3rd Qu.: 5.00
## Max. :10896.0 Max. :57889.7 Max. :20867.86
##
## subscriptions_sum bookstores_sum business_org_serv_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 49.09 Mean : 37.74 Mean : 57.52
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 6.61
## Max. :55001.20 Max. :15765.70 Max. :36623.60
##
## hotel_lodge_sum computer_electronic_sum bakeries_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 129.95 Mean : 84.72 Mean : 9.151
## 3rd Qu.: 13.39 3rd Qu.: 12.28 3rd Qu.: 0.000
## Max. :54631.60 Max. :52656.00 Max. :2835.200
##
## gambling_gaming_sum record_stores_sum digital_goods_sum
## Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.00 Median : 0.000 Median : 0.00
## Mean : 60.87 Mean : 8.281 Mean : 12.76
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00
## Max. :110381.00 Max. :22874.270 Max. :22165.19
##
## airline_sum beauty_stores_sum transport_serv_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 98.08 Mean : 25.09 Mean : 7.888
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :47573.15 Max. :28088.09 Max. :16476.000
##
## fines_tax_gov_sum money_financial_sum professional_serv_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 22.08 Mean : 90.73 Mean : 19.98
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :70762.28 Max. :69678.22 Max. :30000.00
##
## discount_stores_sum travel_agencies_sum computer_data_serv_sum
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 0.00 Median : 0.0 Median : 0.00
## Mean : 11.87 Mean : 61.1 Mean : 13.79
## 3rd Qu.: 0.00 3rd Qu.: 0.0 3rd Qu.: 0.00
## Max. :44565.12 Max. :35095.8 Max. :28945.56
##
## car_rental_sum health_serv_sum advertising_serv_sum
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 25.05 Mean : 20.98 Mean : 17.11
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :34410.49 Max. :18238.78 Max. :151631.37
##
## education_sum dating_serv_sum no_cat_sum cohort
## Min. : 0.00 Min. : 0.000 Min. : 0.00 2018-09:82932
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 2018-10:80820
## Median : 0.00 Median : 0.000 Median : 0.00
## Mean : 17.88 Mean : 1.141 Mean : 171.23
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 70.99
## Max. :32999.97 Max. :5812.690 Max. :51036.75
##
## continent cntry_nationality round
## AFRICA : 3417 FRANCE :46582 Min. :-1625597.0
## ASIA : 4726 ITALY :36359 1st Qu.: -180.4
## EUROPE :146205 GERMANY:22804 Median : 5.0
## NORTH AMERICA: 2426 SPAIN :13328 Mean : 2436.6
## OCEANIA : 473 BRAZIL : 4780 3rd Qu.: 2010.0
## SOUTH AMERICA: 6505 AUSTRIA: 3568 Max. : 504132.0
## (Other):36331
## pnl_cat
## neg_pnl: 57398
## pos_pnl:106354
##
##
##
##
##
# REMOVE users that did not make an external deposit
# (77% made at least 1 ext deposit)
# NOTE(review): the filter keys on mau_act (active months), not on external
# deposits directly — presumably equivalent for this cohort; confirm.
df1 <- df[df$mau_act > 0, ]
dim(df1)
## [1] 125356 140
# Inactive users kept aside for reference.
mau0 <- df[df$mau_act == 0, ]
# Split off the Business accounts.
business_products <- c('BUSINESS_CARD', 'BUSINESS_BLACK')
bus <- df1[df1$product_id %in% business_products, ]
dim(bus)
## [1] 11502 140
# Split SAUs (users with >= 1 week SAU flag);
# 11% of users have at least 1 week of SAU flag.
summary(df1$months_sau)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.8654 0.0000 15.0000
sau <- df1[df1$months_sau > 0, ]
dim(sau)
## [1] 18031 140
# TYPICAL N26 USERS
# NON-business accounts N= 97031
#df1 <- df1[(df1$months_sau == 0),]
df1 <- df1[!(df1$product_id %in% business_products), ]
dim(df1)
## [1] 113854 140
head(df1)
## id user_created kycc membership product_id market
## 1 1 2015-05-25 18:23:10 2018-09-03 15:29:47 standard STANDARD DEU
## 2 2 2015-07-20 08:56:19 2018-10-26 17:03:30 standard STANDARD AUT
## 3 3 2015-07-29 11:20:35 2018-10-24 13:13:25 standard STANDARD
## 4 4 2015-08-26 14:07:42 2018-10-12 11:14:12 standard STANDARD AUT
## 5 5 2015-08-29 13:43:24 2018-09-29 14:28:19 standard STANDARD
## 7 7 2015-09-24 18:02:53 2018-09-17 14:45:10 standard STANDARD
## nationality nat_status referral closed_at kycc_closed_days
## 1 DEU native 0
## 2 AUT native 0
## 3 AUT native 0
## 4 AUT native 2019-11-28 412
## 5 AUT native friend_referral 0
## 7 AUT native 0
## gender age_group age weeks_wau_txn mau_txn months_sau mau_act n_pt_dom
## 1 MALE 25-29 30 40 12 0 12 8
## 2 MALE 25-29 27 3 2 0 1 1
## 3 MALE 45-49 48 13 8 0 4 0
## 4 MALE 30-34 32 4 3 0 2 0
## 5 MALE 30-34 35 35 11 0 7 39
## 7 MALE 25-29 27 51 12 0 6 120
## n_pt_intra n_pt_inter n_pt_ecomm n_pt_dom_atm n_pt_intra_atm
## 1 105 0 53 2 2
## 2 0 0 0 0 0
## 3 4 3 7 0 1
## 4 0 0 0 3 0
## 5 23 0 13 6 1
## 7 5 0 11 0 0
## n_pt_inter_atm n_dt n_dd n_ft n_cash26 n_ct n_wu n_space_ct n_space_dt
## 1 0 0 0 1 0 23 0 5 5
## 2 0 0 0 1 0 2 0 0 0
## 3 0 0 0 0 3 2 0 0 0
## 4 0 0 0 0 0 3 0 0 0
## 5 0 0 0 0 0 10 0 0 0
## 7 0 1 0 0 0 11 0 0 0
## pt_dom_sum pt_intra_sum pt_inter_sum pt_ecomm_sum pt_dom_atm_sum
## 1 124.73 1143.68 0.00 1693.19 20
## 2 51.60 0.00 0.00 0.00 0
## 3 0.00 42.79 338.02 1045.59 0
## 4 0.00 0.00 0.00 0.00 30
## 5 2420.15 319.73 0.00 539.61 1260
## 7 659.91 35.95 0.00 62.09 0
## pt_intra_atm_sum pt_inter_atm_sum dt_sum dd_sum ft_sum cash26_sum
## 1 70.00 0 0 0 3.25 0
## 2 0.00 0 0 0 31.00 0
## 3 174.43 0 0 0 0.00 1000
## 4 0.00 0 0 0 0.00 0
## 5 120.00 0 0 0 0.00 0
## 7 0.00 0 10 0 0.00 0
## ct_sum wu_sum avg_primary_bal avg_spaces_bal n_spaces avg_space_ct
## 1 2815.76 0 -86.437 0 2 28.36879
## 2 101.00 0 15.771 0 0 0.00000
## 3 850.00 0 435.727 0 0 0.00000
## 4 71.90 0 -40.150 0 2 0.00000
## 5 4826.00 0 312.121 0 0 0.00000
## 7 810.00 0 60.531 0 0 0.00000
## avg_space_dt n_ext_out n_ext_in ext_out_sum ext_in_sum grocery_market
## 1 28.36879 171 24 3054.85 2817.26 14
## 2 0.00000 2 2 82.60 101.00 0
## 3 0.00000 15 5 1600.83 1850.00 0
## 4 0.00000 3 3 30.00 71.90 0
## 5 0.00000 82 12 4659.49 4871.05 10
## 7 0.00000 138 17 771.14 852.71 90
## restaurant atm fast_food local_transport clothing retail household
## 1 20 4 9 4 0 4 2
## 2 1 0 0 0 0 0 0
## 3 4 1 0 0 1 0 0
## 4 0 3 0 0 0 0 0
## 5 1 7 0 0 4 2 0
## 7 4 0 12 10 3 0 0
## gas_service food_drinks taxicabs drug_pharma bars_clubs car_toll_parking
## 1 1 9 10 2 36 0
## 2 0 0 0 0 0 0
## 3 0 0 1 0 0 0
## 4 0 0 0 0 0 0
## 5 1 0 21 2 0 0
## 7 0 0 0 0 0 0
## entertainment utilities subscriptions bookstores business_org_serv
## 1 0 0 3 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 0
## 5 0 0 1 0 0
## 7 0 0 11 1 0
## hotel_lodge computer_electronic bakeries gambling_gaming record_stores
## 1 1 1 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 1 21 1 0 0
## 7 0 2 1 0 0
## digital_goods airline beauty_stores transport_serv fines_tax_gov
## 1 4 1 0 4 1
## 2 0 0 0 0 0
## 3 0 3 0 0 0
## 4 0 0 0 0 0
## 5 6 0 0 0 0
## 7 0 0 0 0 0
## money_financial professional_serv discount_stores travel_agencies
## 1 35 4 0 1
## 2 0 0 0 0
## 3 0 0 0 3
## 4 0 0 0 0
## 5 0 0 2 0
## 7 0 0 0 0
## computer_data_serv car_rental health_serv advertising_serv education
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 7 0 0 0 0 0
## dating_serv no_cat grocery_market_sum restaurant_sum atm_sum
## 1 0 0 178.34 298.70 90.00
## 2 0 0 0.00 51.60 0.00
## 3 0 1 0.00 42.79 174.43
## 4 0 0 0.00 0.00 30.00
## 5 0 2 418.86 15.00 1380.00
## 7 0 2 445.88 62.40 0.00
## fast_food_sum local_transport_sum clothing_sum retail_sum household_sum
## 1 60.20 45.85 0.00 18.75 21.5
## 2 0.00 0.00 0.00 0.00 0.0
## 3 0.00 0.00 139.00 0.00 0.0
## 4 0.00 0.00 0.00 0.00 0.0
## 5 0.00 0.00 257.51 172.86 0.0
## 7 71.57 59.40 4.02 0.00 0.0
## gas_service_sum food_drinks_sum taxicabs_sum drug_pharma_sum
## 1 3.8 35.5 110.60 12.10
## 2 0.0 0.0 0.00 0.00
## 3 0.0 0.0 15.79 0.00
## 4 0.0 0.0 0.00 0.00
## 5 13.9 0.0 188.80 119.24
## 7 0.0 0.0 0.00 0.00
## bars_clubs_sum car_toll_parking_sum entertainment_sum utilities_sum
## 1 394.1 0 0 0
## 2 0.0 0 0 0
## 3 0.0 0 0 0
## 4 0.0 0 0 0
## 5 0.0 0 0 0
## 7 0.0 0 0 0
## subscriptions_sum bookstores_sum business_org_serv_sum hotel_lodge_sum
## 1 2.80 0.0 0.0 11.8
## 2 0.00 0.0 0.0 0.0
## 3 0.00 0.0 70.5 0.0
## 4 0.00 0.0 0.0 0.0
## 5 67.40 0.0 0.0 25.8
## 7 58.68 3.1 0.0 0.0
## computer_electronic_sum bakeries_sum gambling_gaming_sum
## 1 9.99 0.00 0
## 2 0.00 0.00 0
## 3 0.00 0.00 0
## 4 0.00 0.00 0
## 5 1574.55 2.95 0
## 7 35.34 2.20 0
## record_stores_sum digital_goods_sum airline_sum beauty_stores_sum
## 1 0 38.95 30.00 0
## 2 0 0.00 0.00 0
## 3 0 0.00 716.03 0
## 4 0 0.00 0.00 0
## 5 0 115.93 0.00 0
## 7 0 0.00 0.00 0
## transport_serv_sum fines_tax_gov_sum money_financial_sum
## 1 32.1 15 1465.3
## 2 0.0 0 0.0
## 3 0.0 0 0.0
## 4 0.0 0 0.0
## 5 0.0 0 0.0
## 7 0.0 0 0.0
## professional_serv_sum discount_stores_sum travel_agencies_sum
## 1 126.22 0.00 50.00
## 2 0.00 0.00 0.00
## 3 0.00 0.00 258.35
## 4 0.00 0.00 0.00
## 5 0.00 172.79 0.00
## 7 0.00 0.00 0.00
## computer_data_serv_sum car_rental_sum health_serv_sum
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 7 0 0 0
## advertising_serv_sum education_sum dating_serv_sum no_cat_sum cohort
## 1 0 0 0 0.00 2018-09
## 2 0 0 0 0.00 2018-10
## 3 0 0 0 183.94 2018-10
## 4 0 0 0 0.00 2018-10
## 5 0 0 0 133.90 2018-09
## 7 0 0 0 15.36 2018-09
## continent cntry_nationality round pnl_cat
## 1 EUROPE GERMANY 1694.01 pos_pnl
## 2 EUROPE AUSTRIA 23.00 pos_pnl
## 3 EUROPE AUSTRIA 2185.00 pos_pnl
## 4 EUROPE AUSTRIA -2541.52 neg_pnl
## 5 EUROPE AUSTRIA 361.00 pos_pnl
## 7 EUROPE AUSTRIA 157.00 pos_pnl
df_fin <- df1[, 14:135]
# FREQUENCY OF USERS FOR EACH FEATURE
library("ggplot2")

# Fraction of entries in `x` that are non-zero, ignoring NAs.
# (The original expression `sum((feat[,i]) & (!is.na(feat[,i])) != 0)` only
# worked by operator-precedence accident; this states the intent directly.)
pct_nonzero <- function(x) {
  sum(x != 0, na.rm = TRUE) / length(x)
}

# One row per feature: percent of users with a non-zero value for it.
# Built with lapply + a single rbind instead of growing `d` inside a loop.
d <- do.call(rbind, lapply(seq_len(ncol(df_fin)), function(i) {
  percent <- pct_nonzero(df_fin[, i])
  print(percent)  # keep the per-feature trace, as in the original run
  data.frame(percent, feature = colnames(df_fin)[i])
}))
## [1] 1
## [1] 0.9994203
## [1] 0.9991656
## [1] 0.1477594
## [1] 1
## [1] 0.7868147
## [1] 0.5802256
## [1] 0.2499517
## [1] 0.7516117
## [1] 0.6709031
## [1] 0.2399652
## [1] 0.1543731
## [1] 0.5966325
## [1] 0.2154777
## [1] 0.2018287
## [1] 0.05893513
## [1] 0.9975495
## [1] 0.230172
## [1] 0.2612996
## [1] 0.2613523
## [1] 0.7868147
## [1] 0.5802256
## [1] 0.2499517
## [1] 0.7516117
## [1] 0.6709031
## [1] 0.2399652
## [1] 0.1543731
## [1] 0.5966325
## [1] 0.2154777
## [1] 0.2018287
## [1] 0.05893513
## [1] 0.9975495
## [1] 0.230172
## [1] 0.9753281
## [1] 0.2124124
## [1] 0.3232034
## [1] 0.2612996
## [1] 0.2613523
## [1] 0.9602825
## [1] 1
## [1] 0.9602825
## [1] 1
## [1] 0.6697261
## [1] 0.629587
## [1] 0.7297592
## [1] 0.5890702
## [1] 0.4573401
## [1] 0.5839584
## [1] 0.6000492
## [1] 0.5276494
## [1] 0.4189927
## [1] 0.446124
## [1] 0.2404659
## [1] 0.4258612
## [1] 0.3539182
## [1] 0.2756249
## [1] 0.4293569
## [1] 0.3394259
## [1] 0.2762397
## [1] 0.319005
## [1] 0.363606
## [1] 0.3555167
## [1] 0.3753667
## [1] 0.2926028
## [1] 0.09953976
## [1] 0.1327226
## [1] 0.1921935
## [1] 0.2465702
## [1] 0.2774782
## [1] 0.1689357
## [1] 0.2429164
## [1] 0.1418483
## [1] 0.2108929
## [1] 0.156727
## [1] 0.1928874
## [1] 0.1437631
## [1] 0.09736153
## [1] 0.1535036
## [1] 0.07366452
## [1] 0.08308009
## [1] 0.01901558
## [1] 0.5390588
## [1] 0.6697261
## [1] 0.629587
## [1] 0.7297592
## [1] 0.5890702
## [1] 0.4573401
## [1] 0.5839584
## [1] 0.6000492
## [1] 0.5276494
## [1] 0.4189927
## [1] 0.446124
## [1] 0.2404659
## [1] 0.4258612
## [1] 0.3539182
## [1] 0.2756249
## [1] 0.4293569
## [1] 0.3394259
## [1] 0.2762397
## [1] 0.319005
## [1] 0.363606
## [1] 0.3555167
## [1] 0.3753667
## [1] 0.2926028
## [1] 0.09953976
## [1] 0.1327226
## [1] 0.1921935
## [1] 0.2465702
## [1] 0.2774782
## [1] 0.1689357
## [1] 0.2429164
## [1] 0.1418483
## [1] 0.2108929
## [1] 0.156727
## [1] 0.1928874
## [1] 0.1437631
## [1] 0.09736153
## [1] 0.1535036
## [1] 0.07366452
## [1] 0.08308009
## [1] 0.01901558
## [1] 0.5390588
# Features ranked by share of users with non-zero values, most common first.
d.sort <- d[order(-d$percent), ]
print(d.sort)
## percent feature
## 1 1.00000000 age
## 5 1.00000000 mau_act
## 40 1.00000000 n_ext_in
## 42 1.00000000 ext_in_sum
## 2 0.99942031 weeks_wau_txn
## 3 0.99916560 mau_txn
## 17 0.99754949 n_ct
## 32 0.99754949 ct_sum
## 34 0.97532805 avg_primary_bal
## 39 0.96028247 n_ext_out
## 41 0.96028247 ext_out_sum
## 6 0.78681469 n_pt_dom
## 21 0.78681469 pt_dom_sum
## 9 0.75161171 n_pt_ecomm
## 24 0.75161171 pt_ecomm_sum
## 45 0.72975917 atm
## 85 0.72975917 atm_sum
## 10 0.67090309 n_pt_dom_atm
## 25 0.67090309 pt_dom_atm_sum
## 43 0.66972614 grocery_market
## 83 0.66972614 grocery_market_sum
## 44 0.62958701 restaurant
## 84 0.62958701 restaurant_sum
## 49 0.60004919 retail
## 89 0.60004919 retail_sum
## 13 0.59663253 n_dt
## 28 0.59663253 dt_sum
## 46 0.58907021 fast_food
## 86 0.58907021 fast_food_sum
## 48 0.58395840 clothing
## 88 0.58395840 clothing_sum
## 7 0.58022555 n_pt_intra
## 22 0.58022555 pt_intra_sum
## 82 0.53905879 no_cat
## 122 0.53905879 no_cat_sum
## 50 0.52764945 household
## 90 0.52764945 household_sum
## 47 0.45734010 local_transport
## 87 0.45734010 local_transport_sum
## 52 0.44612398 food_drinks
## 92 0.44612398 food_drinks_sum
## 57 0.42935690 entertainment
## 97 0.42935690 entertainment_sum
## 54 0.42586119 drug_pharma
## 94 0.42586119 drug_pharma_sum
## 51 0.41899275 gas_service
## 91 0.41899275 gas_service_sum
## 63 0.37536670 computer_electronic
## 103 0.37536670 computer_electronic_sum
## 61 0.36360602 business_org_serv
## 101 0.36360602 business_org_serv_sum
## 62 0.35551671 hotel_lodge
## 102 0.35551671 hotel_lodge_sum
## 55 0.35391818 bars_clubs
## 95 0.35391818 bars_clubs_sum
## 58 0.33942593 utilities
## 98 0.33942593 utilities_sum
## 36 0.32320340 n_spaces
## 60 0.31900504 bookstores
## 100 0.31900504 bookstores_sum
## 64 0.29260281 bakeries
## 104 0.29260281 bakeries_sum
## 69 0.27747817 beauty_stores
## 109 0.27747817 beauty_stores_sum
## 59 0.27623975 subscriptions
## 99 0.27623975 subscriptions_sum
## 56 0.27562492 car_toll_parking
## 96 0.27562492 car_toll_parking_sum
## 20 0.26135226 n_space_dt
## 38 0.26135226 avg_space_dt
## 19 0.26129956 n_space_ct
## 37 0.26129956 avg_space_ct
## 8 0.24995169 n_pt_inter
## 23 0.24995169 pt_inter_sum
## 68 0.24657017 airline
## 108 0.24657017 airline_sum
## 71 0.24291637 fines_tax_gov
## 111 0.24291637 fines_tax_gov_sum
## 53 0.24046586 taxicabs
## 93 0.24046586 taxicabs_sum
## 11 0.23996522 n_pt_intra_atm
## 26 0.23996522 pt_intra_atm_sum
## 18 0.23017197 n_wu
## 33 0.23017197 wu_sum
## 14 0.21547772 n_dd
## 29 0.21547772 dd_sum
## 35 0.21241239 avg_spaces_bal
## 73 0.21089290 professional_serv
## 113 0.21089290 professional_serv_sum
## 15 0.20182866 n_ft
## 30 0.20182866 ft_sum
## 75 0.19288738 travel_agencies
## 115 0.19288738 travel_agencies_sum
## 67 0.19219351 digital_goods
## 107 0.19219351 digital_goods_sum
## 70 0.16893565 transport_serv
## 110 0.16893565 transport_serv_sum
## 74 0.15672704 discount_stores
## 114 0.15672704 discount_stores_sum
## 12 0.15437314 n_pt_inter_atm
## 27 0.15437314 pt_inter_atm_sum
## 78 0.15350361 health_serv
## 118 0.15350361 health_serv_sum
## 4 0.14775941 months_sau
## 76 0.14376306 computer_data_serv
## 116 0.14376306 computer_data_serv_sum
## 72 0.14184833 money_financial
## 112 0.14184833 money_financial_sum
## 66 0.13272261 record_stores
## 106 0.13272261 record_stores_sum
## 65 0.09953976 gambling_gaming
## 105 0.09953976 gambling_gaming_sum
## 77 0.09736153 car_rental
## 117 0.09736153 car_rental_sum
## 80 0.08308009 education
## 120 0.08308009 education_sum
## 79 0.07366452 advertising_serv
## 119 0.07366452 advertising_serv_sum
## 16 0.05893513 n_cash26
## 31 0.05893513 cash26_sum
## 81 0.01901558 dating_serv
## 121 0.01901558 dating_serv_sum
# All defaults
ggplot(d, aes(x=reorder(feature, -percent), y=percent, fill=feature, label=scales::percent(percent)))+
geom_bar(stat="identity", width=0.5) + ylab('Percent of Non-Zero Counts') +
xlab('Features') +
theme(axis.text.x = element_text(angle = 45, size=4, face='bold'), legend.position='none') +
geom_text(position = position_dodge(width = .9), # move to center of bars
vjust = -0.5, # nudge above top of bar
size = 1) +
scale_y_continuous(labels = scales::percent)
## Results
# There are a few low-frequency features that will not provide any meaningful
# variation to the analysis, so we drop them, as they create NA's in PCA.
# REMOVE low frequency features (< ~2% of users non-zero).
rm.feat <- d[d$percent < 0.021, ]
keep_cols <- !(colnames(df_fin) %in% rm.feat$feature)
df_fin <- df_fin[, keep_cols]
#df_fin$months_sau <- NULL
dim(df_fin)
## [1] 113854 120
# remove SAU feature
# df_fin$months_sau <- NULL
# NOTE: tried both removing and including the SAU feature; no significant
# differences in the final behavioral groupings, but marginal improvements
# in classifying users, so the feature is kept for the final results.
names(df_fin)
## [1] "age" "weeks_wau_txn"
## [3] "mau_txn" "months_sau"
## [5] "mau_act" "n_pt_dom"
## [7] "n_pt_intra" "n_pt_inter"
## [9] "n_pt_ecomm" "n_pt_dom_atm"
## [11] "n_pt_intra_atm" "n_pt_inter_atm"
## [13] "n_dt" "n_dd"
## [15] "n_ft" "n_cash26"
## [17] "n_ct" "n_wu"
## [19] "n_space_ct" "n_space_dt"
## [21] "pt_dom_sum" "pt_intra_sum"
## [23] "pt_inter_sum" "pt_ecomm_sum"
## [25] "pt_dom_atm_sum" "pt_intra_atm_sum"
## [27] "pt_inter_atm_sum" "dt_sum"
## [29] "dd_sum" "ft_sum"
## [31] "cash26_sum" "ct_sum"
## [33] "wu_sum" "avg_primary_bal"
## [35] "avg_spaces_bal" "n_spaces"
## [37] "avg_space_ct" "avg_space_dt"
## [39] "n_ext_out" "n_ext_in"
## [41] "ext_out_sum" "ext_in_sum"
## [43] "grocery_market" "restaurant"
## [45] "atm" "fast_food"
## [47] "local_transport" "clothing"
## [49] "retail" "household"
## [51] "gas_service" "food_drinks"
## [53] "taxicabs" "drug_pharma"
## [55] "bars_clubs" "car_toll_parking"
## [57] "entertainment" "utilities"
## [59] "subscriptions" "bookstores"
## [61] "business_org_serv" "hotel_lodge"
## [63] "computer_electronic" "bakeries"
## [65] "gambling_gaming" "record_stores"
## [67] "digital_goods" "airline"
## [69] "beauty_stores" "transport_serv"
## [71] "fines_tax_gov" "money_financial"
## [73] "professional_serv" "discount_stores"
## [75] "travel_agencies" "computer_data_serv"
## [77] "car_rental" "health_serv"
## [79] "advertising_serv" "education"
## [81] "no_cat" "grocery_market_sum"
## [83] "restaurant_sum" "atm_sum"
## [85] "fast_food_sum" "local_transport_sum"
## [87] "clothing_sum" "retail_sum"
## [89] "household_sum" "gas_service_sum"
## [91] "food_drinks_sum" "taxicabs_sum"
## [93] "drug_pharma_sum" "bars_clubs_sum"
## [95] "car_toll_parking_sum" "entertainment_sum"
## [97] "utilities_sum" "subscriptions_sum"
## [99] "bookstores_sum" "business_org_serv_sum"
## [101] "hotel_lodge_sum" "computer_electronic_sum"
## [103] "bakeries_sum" "gambling_gaming_sum"
## [105] "record_stores_sum" "digital_goods_sum"
## [107] "airline_sum" "beauty_stores_sum"
## [109] "transport_serv_sum" "fines_tax_gov_sum"
## [111] "money_financial_sum" "professional_serv_sum"
## [113] "discount_stores_sum" "travel_agencies_sum"
## [115] "computer_data_serv_sum" "car_rental_sum"
## [117] "health_serv_sum" "advertising_serv_sum"
## [119] "education_sum" "no_cat_sum"
# normalize COUNT DATA by ROW SUM --> to get relative importance of trxn count data for each user
# Work on a copy so df_fin keeps the raw counts.
df_cnt_row <- df_fin
# Make count data relative proportions by dividing by total transaction count for each user
names(df_cnt_row[,c(5:19)]) # txn count
## [1] "mau_act" "n_pt_dom" "n_pt_intra" "n_pt_inter"
## [5] "n_pt_ecomm" "n_pt_dom_atm" "n_pt_intra_atm" "n_pt_inter_atm"
## [9] "n_dt" "n_dd" "n_ft" "n_cash26"
## [13] "n_ct" "n_wu" "n_space_ct"
names(df_cnt_row[,c(43:81)]) # mcc count
## [1] "grocery_market" "restaurant" "atm"
## [4] "fast_food" "local_transport" "clothing"
## [7] "retail" "household" "gas_service"
## [10] "food_drinks" "taxicabs" "drug_pharma"
## [13] "bars_clubs" "car_toll_parking" "entertainment"
## [16] "utilities" "subscriptions" "bookstores"
## [19] "business_org_serv" "hotel_lodge" "computer_electronic"
## [22] "bakeries" "gambling_gaming" "record_stores"
## [25] "digital_goods" "airline" "beauty_stores"
## [28] "transport_serv" "fines_tax_gov" "money_financial"
## [31] "professional_serv" "discount_stores" "travel_agencies"
## [34] "computer_data_serv" "car_rental" "health_serv"
## [37] "advertising_serv" "education" "no_cat"
# NOTE(review): index mismatch — the numerator is columns 6:20
# (n_pt_dom .. n_space_dt) but the denominator sums columns 5:19
# (mau_act .. n_space_ct), i.e. it includes mau_act (not a txn count) and
# drops n_space_dt. Presumably both sides should be 6:20 — confirm intent
# before changing, as it alters all downstream PCA results.
df_cnt_row[,c(6:20)] <- df_cnt_row[,c(6:20)]/rowSums(df_cnt_row[,c(5:19)])
# MCC counts become per-user proportions of all categorized merchant txns.
df_cnt_row[,c(43:81)] <- df_cnt_row[,c(43:81)]/rowSums(df_cnt_row[,c(43:81)])
# Users with a zero row sum produce NaN (0/0) above; reset those to 0.
df_cnt_row[is.na(df_cnt_row)] <- 0
### Transforming skewed data
# Log-transform the total volume (sum) columns, which are heavily right-skewed
# (see summary() output above).
names(df_cnt_row[,c(21:35)])
## [1] "pt_dom_sum" "pt_intra_sum" "pt_inter_sum"
## [4] "pt_ecomm_sum" "pt_dom_atm_sum" "pt_intra_atm_sum"
## [7] "pt_inter_atm_sum" "dt_sum" "dd_sum"
## [10] "ft_sum" "cash26_sum" "ct_sum"
## [13] "wu_sum" "avg_primary_bal" "avg_spaces_bal"
names(df_cnt_row[,c(82:120)])
## [1] "grocery_market_sum" "restaurant_sum"
## [3] "atm_sum" "fast_food_sum"
## [5] "local_transport_sum" "clothing_sum"
## [7] "retail_sum" "household_sum"
## [9] "gas_service_sum" "food_drinks_sum"
## [11] "taxicabs_sum" "drug_pharma_sum"
## [13] "bars_clubs_sum" "car_toll_parking_sum"
## [15] "entertainment_sum" "utilities_sum"
## [17] "subscriptions_sum" "bookstores_sum"
## [19] "business_org_serv_sum" "hotel_lodge_sum"
## [21] "computer_electronic_sum" "bakeries_sum"
## [23] "gambling_gaming_sum" "record_stores_sum"
## [25] "digital_goods_sum" "airline_sum"
## [27] "beauty_stores_sum" "transport_serv_sum"
## [29] "fines_tax_gov_sum" "money_financial_sum"
## [31] "professional_serv_sum" "discount_stores_sum"
## [33] "travel_agencies_sum" "computer_data_serv_sum"
## [35] "car_rental_sum" "health_serv_sum"
## [37] "advertising_serv_sum" "education_sum"
## [39] "no_cat_sum"
# Replace negative average balances with 0 before the log transform.
# BUG FIX: the original `df_cnt_row[df_cnt_row[,34] < 0,] <- 0` zeroed the
# ENTIRE ROW for any user with a negative avg_primary_bal, wiping all of that
# user's other features; only column 34 should be clamped.
df_cnt_row[df_cnt_row[, 34] < 0, 34] <- 0
# log1p(x) == log(x + 1): handles the zeros in one step and is numerically
# more accurate than the explicit "+ 1 then log" two-step.
df_cnt_row[, c(21:35)] <- log1p(df_cnt_row[, c(21:35)])
df_cnt_row[, c(82:120)] <- log1p(df_cnt_row[, c(82:120)])
#### FEATURES to include in Analysis
#txn_mcc = df_cnt_row[,c(2:35,43:122)]
#mcc_sum = df_cnt_row[,c(81:119)]
# Transaction/volume features only (columns 2:35 of the engineered frame).
txn <- df_cnt_row[, c(2:35)]
# Fix the RNG so the 20k-user sample (and everything downstream) is
# reproducible; remove the seed to test robustness across random samples.
set.seed(42)
sampled_users <- sample(rownames(txn), size = 20000)
# %in% keeps the original row order of txn (rather than the sample order).
df_sample <- txn[rownames(txn) %in% sampled_users, ]
library("FactoMineR")
library("factoextra")
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# PCA on standardized features; retain up to 50 components.
res.pca <- PCA(df_sample, graph = FALSE, ncp = 50, scale.unit = TRUE)
# Eigenvalues / percent variance per component.
eig.val <- get_eigenvalue(res.pca)
head(eig.val, 15)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 8.4359679 24.811670 24.81167
## Dim.2 2.9387129 8.643273 33.45494
## Dim.3 2.3562486 6.930143 40.38509
## Dim.4 2.0738707 6.099620 46.48471
## Dim.5 1.9946992 5.866762 52.35147
## Dim.6 1.7373469 5.109844 57.46131
## Dim.7 1.5484633 4.554304 62.01562
## Dim.8 1.3849199 4.073294 66.08891
## Dim.9 1.2495688 3.675202 69.76411
## Dim.10 1.2367415 3.637475 73.40159
## Dim.11 1.1380728 3.347273 76.74886
## Dim.12 0.8872221 2.609477 79.35834
## Dim.13 0.8111718 2.385799 81.74414
## Dim.14 0.7697510 2.263974 84.00811
## Dim.15 0.6447407 1.896296 85.90441
# variance explained by each dimension/component
# ~40% of variance is explained by the first 3 components
# Scree plot: percent of variance captured by each principal component.
fviz_eig(res.pca, addlabels = T, ylim=c(0,70))
# Per-variable PCA results (coordinates, cos2, contributions per dimension).
var <- get_pca_var(res.pca)
# PCA correlation plot of features relative to the dimensions
library("corrplot")
## corrplot 0.84 loaded
# cos2 (squared loadings): how well each feature is represented on each dim.
corrplot(var$cos2, is.corr=FALSE,tl.cex=0.80)
library(scatterplot3d)
# Per-user (individual) coordinates on the principal components.
pc <- res.pca$ind$coord
# 3-D scatter of the sampled users on the first three components.
scatterplot3d(pc[,1:3], pch=3, color="blue")
# Cumulative variance explained as a function of component count.
plot(eig.val[,3], xlab='Dimensions',ylab='Cumulative Variance',pch=19)
# head(eig.val,15)
# fviz_eig(res.pca, addlabels = T, ylim=c(0,70))
# corrplot(var$cos2, is.corr=FALSE,tl.cex=0.80)
# scatterplot3d(pc[,1:3], pch=3, color="blue")
# plot(eig.val[,3], xlab='Dimensions',ylab='Cumulative Variance',pch=19)
I used PCA to reduce the complexity of the data. By implementing PCA on the data before fitting the clustering model, we can reduce the complexity of the clustering model and avoid overfitting (e.g. fitting noise), as well as improve the performance of the clustering model. I fed in 34 features, which result in 34 independent dimensions, with each dimension capturing variation among correlated features. For instance, dim.3 captures the correlation between n_spaces_ct and n_space_dt. Note that each dim is orthogonal to the others and thus completely independent of them.
We can capture ~86% of the variance in the data by using just the first 15 PCs. Based on the PCA correlation plots, PCs 16-34 do not seem to capture much meaningful variation among the features, so we can drop them.
# biplot with variables and individuals: one helper call per axis pair.
# Variables (feature vectors) drawn in black, individuals (users) in grey.
pca_biplot <- function(dims) {
  fviz_pca_biplot(res.pca, repel = TRUE, geom.ind = "point", axes = dims,
                  col.var = 'black', # Variables color
                  col.ind = 'grey'   # Individuals color
  )
}
# biplot for dim 1 and 2
pca_biplot(c(1, 2))
# biplot for dim 1 and 3
pca_biplot(c(1, 3))
# biplot for dim 2 and 3
pca_biplot(c(2, 3))
# biplot for dim 2 and 5
pca_biplot(c(2, 5))
## Results
These biplots are 2-dimensional representations of the different principal components (dims) that describe the relationships between the features, which are represented by vectors. You can look at all different combinations of dims to investigate the relationships between the features. It's helpful to look at these plots in combination with the PCA correlation plot above. Grey dots represent the users. - Highly positively correlated features are clustered/grouped together, with vectors pointing in the same direction. - Negatively correlated features are positioned on opposite sides of the plot (arrows pointing in opposite directions). - Vectors that are far from the origin (the longest arrows) are well represented on the two dimensions (e.g., dim1 and dim2 show a strong relationship between mau_txn, pt_dom_atm_sum, ct_sum, etc.)
# compute correlation matrix and rm NA's
#df.scaled = scale(df_fin, center=TRUE, scale=TRUE) # non-normalized for rowsums
# NOTE(review): the original also ran scale(txn, ...) here and immediately
# overwrote the result with scale(df_cnt_row, ...); that first call was dead
# code, so it is commented out rather than silently executed.
#df.scaled <- scale(txn, center=TRUE, scale=TRUE)
# Standardize all columns (z-scores), then plot the clustered upper triangle
# of the feature correlation matrix.
df.scaled <- scale(df_cnt_row, center=TRUE, scale=TRUE)
res.cor <- cor(df.scaled)
library('corrplot')
corrplot(res.cor, type="upper", order="hclust",
tl.col="black",
tl.srt=45,
diag=FALSE,
tl.cex=0.50,
#addCoef.col="black"
)
# Individual (user) coordinates from the PCA, users x components.
pc <- res.pca$ind$coord
# KMEANS CLUSTERING
library(factoextra)
# Keep the first 15 PCs (~86% of variance per the eigenvalue table above).
comp <- pc[,1:15]
# K-means clustering
# Determine the correct number of clusters via weighted within sum of squares
gc() # Garbage collect
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1194182 63.8 2378506 127.1 NA 2378506 127.1
## Vcells 81066594 618.5 166666367 1271.6 1024000 161958642 1235.7
# wss[1] = total sum of squares (the k = 1 case); then the total
# within-cluster SS for k = 2..50, each taken from the best of 25 random
# starts (MacQueen updates). Depends on the RNG state set upstream.
wss <- (nrow(comp)-1)*sum(apply(comp,2,var))
for (i in 2:50) wss[i] <- sum(kmeans(comp, centers=i, nstart=25, iter.max=1000,algorithm="MacQueen")$withinss)
# Elbow plot: look for the k where adding clusters stops paying off.
plot(1:50, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares", main="K-means: Identify optimal number of clusters",xaxt="n")
axis(1, at=1:50, labels=c(1:50))
# Elbow method indicates k = 10
Some of the requirements for k-means clustering are: i) you must specify the number of clusters beforehand, and ii) the initial centroids of the clusters are chosen randomly.
Because the initial centroids are randomly chosen, the output of k-means clustering can vary between runs. To get around this issue, we can implement a hybrid hierarchical/k-means algorithm.
Identifying the optimal number of clusters is a hard problem to solve. While there are a variety of methods that can be used to identify the optimal number of clusters, for our first attempt at clustering we can use the simplest approach, the elbow method. This method looks at the within-group variability (within-groups sum of squares) as a function of the number of clusters. The goal is to strike a balance between minimizing the variation within clusters and not dividing users into so many groups that they no longer make much sense.
The elbow of the curve gives us an estimate of the optimal number of clusters, which is k = 8-11. As seen here, this method cannot always unambiguously identify the optimal number of clusters, particularly for data with overlapping clusters. Based on the PCA results, we can see that the observations are densely packed, and the clusters in this data set could potentially have a relatively high degree of overlap. As a result, we'll use another method to test this range of k values and determine the best number of clusters.
# Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis so you will have to look at the data to determine the cluster behaviors and its corresponding cluster number.
# Hybrid hierarchical/k-means: Ward (ward.D2) hierarchical clustering on the
# 15 PCs defines k centers, which then seed k-means -- removing the dependence
# on random centroid initialization.
# k=7
res.hc <- eclust(comp, "hclust", k = 7,
method = "ward.D2", graph = FALSE)
grp <- res.hc$cluster
# Compute cluster centers (per-cluster mean of each PC)
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column (the group id added by aggregate)
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hierarchical-clustering-defined cluster centers
km.res2_7 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_7)
## cluster size ave.sil.width
## 1 1 4207 0.20
## 2 2 4403 0.09
## 3 3 2326 0.04
## 4 4 1573 0.09
## 5 5 1374 0.16
## 6 6 4318 0.19
## 7 7 1799 0.17
# k=8
# Same hybrid hierarchical/k-means procedure as above, with k = 8.
res.hc <- eclust(comp, "hclust", k = 8,
method = "ward.D2", graph = FALSE)
grp <- res.hc$cluster
# Compute cluster centers (per-cluster mean of each PC)
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column (the group id added by aggregate)
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hierarchical-clustering-defined cluster centers
km.res2_8 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_8)
## cluster size ave.sil.width
## 1 1 4173 0.19
## 2 2 4223 0.10
## 3 3 364 0.14
## 4 4 1566 0.09
## 5 5 1359 0.16
## 6 6 4294 0.18
## 7 7 2218 0.08
## 8 8 1803 0.16
#k=9
# Same hybrid hierarchical/k-means procedure as above, with k = 9.
res.hc <- eclust(comp, "hclust", k = 9,
method = "ward.D2", graph = FALSE)
grp <- res.hc$cluster
# Compute cluster centers (per-cluster mean of each PC)
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column (the group id added by aggregate)
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hierarchical-clustering-defined cluster centers
km.res2_9 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_9)
## cluster size ave.sil.width
## 1 1 2719 0.38
## 2 2 3938 0.09
## 3 3 356 0.14
## 4 4 1541 0.09
## 5 5 1341 0.16
## 6 6 4216 0.16
## 7 7 2163 0.08
## 8 8 1795 0.16
## 9 9 1931 0.04
#k=10
# Same hybrid hierarchical/k-means procedure as above, with k = 10
# (the k ultimately chosen for the final segmentation).
res.hc <- eclust(comp, "hclust", k = 10,
method = "ward.D2", graph = FALSE)
grp <- res.hc$cluster
# Compute cluster centers (per-cluster mean of each PC)
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column (the group id added by aggregate)
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hierarchical-clustering-defined cluster centers
km.res2_10 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_10)
## cluster size ave.sil.width
## 1 1 2704 0.37
## 2 2 3757 0.10
## 3 3 348 0.14
## 4 4 1534 0.09
## 5 5 1337 0.16
## 6 6 4198 0.16
## 7 7 2134 0.08
## 8 8 1788 0.16
## 9 9 1342 0.21
## 10 10 858 0.18
#k=11
# Same hybrid hierarchical/k-means procedure as above, with k = 11.
res.hc <- eclust(comp, "hclust", k = 11,
method = "ward.D2", graph = FALSE)
grp <- res.hc$cluster
# Compute cluster centers (per-cluster mean of each PC)
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column (the group id added by aggregate)
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hierarchical-clustering-defined cluster centers
km.res2_11 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_11)
## cluster size ave.sil.width
## 1 1 2377 0.41
## 2 2 2645 0.14
## 3 3 330 0.14
## 4 4 1475 0.08
## 5 5 1342 0.15
## 6 6 4181 0.17
## 7 7 1641 0.16
## 8 8 2179 0.08
## 9 9 1764 0.15
## 10 10 1292 0.21
## 11 11 774 0.19
# fviz_silhouette(km.res2_7)
# fviz_silhouette(km.res2_8)
# fviz_silhouette(km.res2_9)
# fviz_silhouette(km.res2_10)
# fviz_silhouette(km.res2_11)
Silhouette plots are another method of evaluating the optimal k value and determining the quality of the clusters. Silhouette widths provide information on the goodness of clustering: high positive values correspond to users that match well to their cluster and poorly to the neighboring clusters, while negative values correspond to users without a high degree of membership confidence — users on the borderline between clusters who potentially belong to more than one cluster (reference the Appendix for an example of this).
As mentioned above, the clusters are quite dense and not clearly separated, so there are clusters that overlap (reference the Appendix), resulting in users belonging to more than one cluster. In the future, it would be ideal to implement a probabilistic model that can assign probabilities of cluster membership to each user. But for an exploratory analysis, I think this model will suffice and provide some useful insights into the key behavioral groups that exist within this cohort.
# total within cluster sum of squares is an estimate of the % within variation / total variation within clusters-- we want to minimize this number
# here we look at the reduction of total within sum of squares to look at marginal improvements between k values
print('% within cluster variation')
## [1] "% within cluster variation"
km.res2_7$tot.withinss/km.res2_7$totss #k=7
## [1] 0.5681779
km.res2_8$tot.withinss/km.res2_8$totss #k=8
## [1] 0.5405174
km.res2_9$tot.withinss/km.res2_9$totss #k=9
## [1] 0.5168768
km.res2_10$tot.withinss/km.res2_10$totss #k=10
## [1] 0.491871
km.res2_11$tot.withinss/km.res2_11$totss #k=11
## [1] 0.4716877
# Successive differences of the within/total ratio: the marginal gain from
# adding one more cluster (small difference = diminishing returns).
print('an adhoc estimate of improvement based on the differences between % within cluster variation')
## [1] "an adhoc estimate of improvement based on the differences between % within cluster variation"
km.res2_7$tot.withinss/km.res2_7$totss - km.res2_8$tot.withinss/km.res2_8$totss
## [1] 0.0276605
km.res2_8$tot.withinss/km.res2_8$totss - km.res2_9$tot.withinss/km.res2_9$totss
## [1] 0.02364059
km.res2_9$tot.withinss/km.res2_9$totss - km.res2_10$tot.withinss/km.res2_10$totss
## [1] 0.0250058
km.res2_10$tot.withinss/km.res2_10$totss - km.res2_11$tot.withinss/km.res2_11$totss
## [1] 0.02018335
# Comparing the avg cluster widths of clusters -- note that all of the values are in sequential order k= 1 to k = i
# Per-cluster average silhouette widths for each candidate k (all five models
# are printed, not just k=9/10 as the label string below suggests).
print('Average Silohuette width for each clusters K=9 and 10')
## [1] "Average Silohuette width for each clusters K=9 and 10"
km.res2_7$silinfo$clus.avg.widths
## [1] 0.19729105 0.08702396 0.04438300 0.08914760 0.15816017 0.18879858
## [7] 0.16833767
km.res2_8$silinfo$clus.avg.widths
## [1] 0.19345185 0.10188029 0.13525616 0.09016755 0.15876354 0.17941630
## [7] 0.07730085 0.16442455
km.res2_9$silinfo$clus.avg.widths
## [1] 0.38184257 0.08940424 0.13610435 0.08522383 0.15778181 0.16455655
## [7] 0.07791880 0.15953835 0.03873085
km.res2_10$silinfo$clus.avg.widths
## [1] 0.36859409 0.09661772 0.13536666 0.08529343 0.15739041 0.15655218
## [7] 0.08398400 0.16000159 0.20733814 0.18292200
km.res2_11$silinfo$clus.avg.widths
## [1] 0.41164417 0.13890345 0.13865029 0.08101639 0.15065209 0.16570058
## [7] 0.15726672 0.08012286 0.15240249 0.20696029 0.19458587
Identifying the optimal k value to settle on is a difficult problem, and it ends up being a judgement call based on the patterns that emerge from the data and how the clusters split from one k value to the next. I spent a lot of time looking into the data to assess the patterns that emerged from the features across clusters and how the clusters split when I added another k value to the model. I also looked at the % within-cluster variation and the corresponding changes between k values, as well as the overall average silhouette widths, to judge whether adding another cluster to the model improved the clustering.
Because average silhouette width gives us an estimate of the performance of the clusters, I compared the average silhouette widths across the k=7 through k=11 models. I found that there were some improvements in the k=9, 10, and 11 models relative to k=7 and k=8.
I looked at the features and how they relate to differentiating the clusters (reference the barplots below) for k=9 and k=10. Eight clusters show consistent behavioral patterns between k=9 and k=10 (i.e. secondary spenders, barely active users, international travelers, holding account, spaces power users, primary account, unconvinced, euro/international travelers, cash26ers). K=10 extracted a 10th group that corresponds to low-activity friend-referral users, which I thought was an interesting group.
I also looked into k=11 and didn’t really see any meaningful groups arise from dividing the data into an additional cluster.
As a result, I settled for k=10.
# k = 9
# names(km.res2_9)
# hk.df <- data.frame(km.res2_9$cluster)
# colnames(hk.df) <- c("cluster")
names(km.res2_10)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault" "silinfo" "nbclust" "data"
# Chosen model: k = 10. One row per sampled user, holding its cluster label.
hk.df <- data.frame(km.res2_10$cluster)
colnames(hk.df) <- c("cluster")
########### barplots for mean values of clusters
# merge(by=0) joins the raw data to the cluster labels on row names (user
# ids); transform() restores those ids as row names and drops the helper
# Row.names column. Users outside the 20k PCA sample drop out of the join.
merged.df <- transform(merge(df,hk.df,by=0), row.names=Row.names, Row.names=NULL)
# Column 141 is the appended cluster label; make it a factor for plotting.
merged.df[,141]<-as.factor(merged.df[,141])
# NOTE(review): `round` looks like a monetary column (see the head() output
# below); /100 presumably converts cents to currency units -- TODO confirm.
merged.df$round <- merged.df$round/100
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Binary user flag from months_sau: 'SAU' if the user had any SAU months,
# 'NON-SAU' when exactly 0. (Any other value would yield NA; the summary in
# the header shows months_sau has min 0, so none are expected.)
# NOTE(review): the expansion of the "SAU" acronym is not visible here --
# confirm its definition before relying on it in reporting.
merged.df$sau_cat <- case_when(merged.df$months_sau > 0 ~ 'SAU',
merged.df$months_sau == 0 ~ 'NON-SAU')
head(merged.df)
## id user_created kycc membership
## 1 1 2015-05-25 18:23:10 2018-09-03 15:29:47 standard
## 10001 10001 2018-08-28 18:29:53 2018-09-25 14:56:08 standard
## 10002 10002 2018-08-28 18:30:59 2018-09-03 10:19:14 standard
## 10003 10003 2018-08-28 18:43:06 2018-09-01 15:21:16 standard
## 100053 100053 2018-10-03 06:17:59 2018-10-03 06:48:06 standard
## 100066 100066 2018-10-03 06:36:58 2018-10-03 06:54:24 standard
## product_id market nationality nat_status referral closed_at
## 1 STANDARD DEU DEU native
## 10001 STANDARD FRA FRA native paid_search
## 10002 STANDARD FRA FRA native friend_referral
## 10003 STANDARD DEU DEU native
## 100053 STANDARD FRA FRA native paid_search
## 100066 STANDARD FRA FRA native app_store 2020-01-14
## kycc_closed_days gender age_group age weeks_wau_txn mau_txn
## 1 0 MALE 25-29 30 40 12
## 10001 0 FEMALE 25-29 30 41 11
## 10002 0 FEMALE 25-29 30 14 5
## 10003 0 MALE 50-54 51 1 1
## 100053 0 FEMALE 35-39 37 39 9
## 100066 468 MALE 20-24 24 31 9
## months_sau mau_act n_pt_dom n_pt_intra n_pt_inter n_pt_ecomm
## 1 0 12 8 105 0 53
## 10001 0 5 35 7 168 30
## 10002 1 4 5 16 58 4
## 10003 0 1 0 0 0 0
## 100053 0 4 161 8 22 12
## 100066 8 9 175 1 0 40
## n_pt_dom_atm n_pt_intra_atm n_pt_inter_atm n_dt n_dd n_ft n_cash26
## 1 2 2 0 0 0 1 0
## 10001 0 0 18 2 0 0 0
## 10002 2 1 3 0 0 12 0
## 10003 0 0 0 0 0 0 0
## 100053 6 0 1 0 0 0 0
## 100066 7 0 0 9 0 0 0
## n_ct n_wu n_space_ct n_space_dt pt_dom_sum pt_intra_sum
## 1 23 0 5 5 124.73 1143.68
## 10001 6 0 0 0 405.95 14.44
## 10002 20 1 0 0 33.72 419.50
## 10003 1 0 0 0 0.00 0.00
## 100053 6 0 0 0 3504.54 368.41
## 100066 13 1 61 61 2264.30 4.50
## pt_inter_sum pt_ecomm_sum pt_dom_atm_sum pt_intra_atm_sum
## 1 0.00 1693.19 20 70.00
## 10001 2495.37 3329.95 0 0.00
## 10002 1638.73 143.07 100 54.98
## 10003 0.00 0.00 0 0.00
## 100053 1466.41 494.22 160 0.00
## 100066 0.00 1257.81 760 0.00
## pt_inter_atm_sum dt_sum dd_sum ft_sum cash26_sum ct_sum wu_sum
## 1 0.00 0.00 0 3.25 0 2815.76 0
## 10001 750.14 2854.26 0 0.00 0 9849.20 0
## 10002 268.15 0.00 0 309.01 0 3165.00 15
## 10003 0.00 0.00 0 0.00 0 100.00 0
## 100053 481.44 0.00 0 0.00 0 5900.00 0
## 100066 0.00 5430.00 0 0.00 0 9850.79 15
## avg_primary_bal avg_spaces_bal n_spaces avg_space_ct avg_space_dt
## 1 -86.437 0.000 2 28.36879 28.36879
## 10001 632.558 0.000 1 0.00000 0.00000
## 10002 168.195 0.000 2 0.00000 0.00000
## 10003 100.000 0.000 0 0.00000 0.00000
## 100053 746.481 0.000 0 0.00000 0.00000
## 100066 70.920 23.496 11 3337.64706 3337.64706
## n_ext_out n_ext_in ext_out_sum ext_in_sum grocery_market restaurant
## 1 171 24 3054.85 2817.26 14 20
## 10001 260 10 9850.11 9916.19 54 39
## 10002 101 21 2967.16 3180.00 5 11
## 10003 0 1 0.00 100.00 0 0
## 100053 210 8 6475.02 6597.52 16 8
## 100066 232 15 9716.61 9885.74 44 5
## atm fast_food local_transport clothing retail household gas_service
## 1 4 9 4 0 4 2 1
## 10001 18 31 7 3 2 3 1
## 10002 6 7 6 25 5 1 4
## 10003 0 0 0 0 0 0 0
## 100053 7 28 5 2 0 11 1
## 100066 7 45 10 4 13 5 0
## food_drinks taxicabs drug_pharma bars_clubs car_toll_parking
## 1 9 10 2 36 0
## 10001 11 8 10 4 0
## 10002 0 2 3 0 2
## 10003 0 0 0 0 0
## 100053 74 7 0 1 0
## 100066 2 0 2 3 0
## entertainment utilities subscriptions bookstores business_org_serv
## 1 0 0 3 0 0
## 10001 1 4 0 0 0
## 10002 0 0 0 0 0
## 10003 0 0 0 0 0
## 100053 7 1 0 3 6
## 100066 13 0 1 1 23
## hotel_lodge computer_electronic bakeries gambling_gaming
## 1 1 1 0 0
## 10001 25 0 6 0
## 10002 3 0 0 0
## 10003 0 0 0 0
## 100053 0 1 6 0
## 100066 2 6 9 0
## record_stores digital_goods airline beauty_stores transport_serv
## 1 0 4 1 0 4
## 10001 0 0 2 0 7
## 10002 0 0 0 2 0
## 10003 0 0 0 0 0
## 100053 0 0 0 8 0
## 100066 0 6 0 3 0
## fines_tax_gov money_financial professional_serv discount_stores
## 1 1 35 4 0
## 10001 5 1 0 3
## 10002 2 0 1 1
## 10003 0 0 0 0
## 100053 2 0 2 0
## 100066 0 0 15 0
## travel_agencies computer_data_serv car_rental health_serv
## 1 1 0 0 0
## 10001 2 0 0 1
## 10002 0 0 0 0
## 10003 0 0 0 0
## 100053 1 3 3 4
## 100066 0 0 0 0
## advertising_serv education dating_serv no_cat grocery_market_sum
## 1 0 0 0 0 178.34
## 10001 0 0 0 10 489.12
## 10002 0 0 0 3 51.94
## 10003 0 0 0 0 0.00
## 100053 0 0 0 3 321.17
## 100066 0 0 0 4 511.95
## restaurant_sum atm_sum fast_food_sum local_transport_sum
## 1 298.70 90.00 60.20 45.85
## 10001 496.27 750.14 215.86 29.85
## 10002 274.32 423.13 73.64 55.89
## 10003 0.00 0.00 0.00 0.00
## 100053 131.35 641.44 393.21 18.11
## 100066 65.30 760.00 476.49 221.90
## clothing_sum retail_sum household_sum gas_service_sum
## 1 0.00 18.75 21.50 3.80
## 10001 33.88 590.21 61.94 1.60
## 10002 750.47 91.56 24.05 64.62
## 10003 0.00 0.00 0.00 0.00
## 100053 171.09 0.00 949.60 10.71
## 100066 410.34 155.56 105.70 0.00
## food_drinks_sum taxicabs_sum drug_pharma_sum bars_clubs_sum
## 1 35.50 110.60 12.10 394.1
## 10001 90.49 98.69 63.03 67.5
## 10002 0.00 31.84 43.87 0.0
## 10003 0.00 0.00 0.00 0.0
## 100053 122.94 228.46 0.00 10.0
## 100066 25.00 0.00 32.86 42.0
## car_toll_parking_sum entertainment_sum utilities_sum
## 1 0.00 0.00 0.00
## 10001 0.00 6.38 60.04
## 10002 22.61 0.00 0.00
## 10003 0.00 0.00 0.00
## 100053 0.00 499.98 14.21
## 100066 0.00 176.37 0.00
## subscriptions_sum bookstores_sum business_org_serv_sum
## 1 2.8 0.00 0.00
## 10001 0.0 0.00 0.00
## 10002 0.0 0.00 0.00
## 10003 0.0 0.00 0.00
## 100053 0.0 18.86 90.49
## 100066 49.0 17.00 360.48
## hotel_lodge_sum computer_electronic_sum bakeries_sum
## 1 11.80 9.99 0.00
## 10001 1922.14 0.00 65.41
## 10002 522.74 0.00 0.00
## 10003 0.00 0.00 0.00
## 100053 0.00 75.00 35.40
## 100066 13.40 133.07 67.08
## gambling_gaming_sum record_stores_sum digital_goods_sum airline_sum
## 1 0 0 38.95 30.0
## 10001 0 0 0.00 350.7
## 10002 0 0 0.00 0.0
## 10003 0 0 0.00 0.0
## 100053 0 0 0.00 0.0
## 100066 0 0 101.93 0.0
## beauty_stores_sum transport_serv_sum fines_tax_gov_sum
## 1 0.0 32.10 15.00
## 10001 0.0 14.44 47.31
## 10002 65.2 0.00 3.80
## 10003 0.0 0.00 0.00
## 100053 1067.2 0.00 55.83
## 100066 57.0 0.00 0.00
## money_financial_sum professional_serv_sum discount_stores_sum
## 1 1465.30 126.22 0.00
## 10001 628.41 0.00 60.00
## 10002 0.00 8.77 5.97
## 10003 0.00 0.00 0.00
## 100053 0.00 14.05 0.00
## 100066 0.00 188.74 0.00
## travel_agencies_sum computer_data_serv_sum car_rental_sum
## 1 50.00 0 0.00
## 10001 725.83 0 0.00
## 10002 0.00 0 0.00
## 10003 0.00 0 0.00
## 100053 135.39 50 213.82
## 100066 0.00 0 0.00
## health_serv_sum advertising_serv_sum education_sum dating_serv_sum
## 1 0.0 0 0 0
## 10001 51.5 0 0 0
## 10002 0.0 0 0 0
## 10003 0.0 0 0 0
## 100053 397.0 0 0 0
## 100066 0.0 0 0 0
## no_cat_sum cohort continent cntry_nationality round pnl_cat
## 1 0.00 2018-09 EUROPE GERMANY 16.9401 pos_pnl
## 10001 75.11 2018-09 EUROPE FRANCE 67.8300 pos_pnl
## 10002 143.73 2018-09 EUROPE FRANCE 24.3000 pos_pnl
## 10003 0.00 2018-09 EUROPE GERMANY -1.0000 neg_pnl
## 100053 809.71 2018-10 EUROPE FRANCE 4.9600 pos_pnl
## 100066 315.44 2018-10 EUROPE FRANCE 203.5689 pos_pnl
## cluster sau_cat
## 1 1 NON-SAU
## 10001 8 NON-SAU
## 10002 8 SAU
## 10003 1 NON-SAU
## 100053 8 NON-SAU
## 100066 5 SAU
# Demographic / categorical columns (cols 4:9 membership..referral, 12:13
# gender/age_group, plus the appended cols 136:142 incl. cluster and sau_cat)
# for later profiling of the clusters.
demo.df <- merged.df[,c(4:9,12:13,136:142)]
library('reshape2')
library('reshape')
##
## Attaching package: 'reshape'
## The following objects are masked from 'package:reshape2':
##
## colsplit, melt, recast
library('tidyr')
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
##
## expand, smiths
## The following object is masked from 'package:reshape2':
##
## smiths
library('dplyr')
# FIX: ggplot2 was loaded twice in a row; one library() call suffices
# (library() is idempotent, so this change is cosmetic only).
library(ggplot2)
# Cluster sizes: col 141 of merged.df is the k=10 cluster label.
cluster.freq <- as.data.frame(table(merged.df[,141]))
# Share of sampled users per cluster. FIX: derive the denominator from the
# data instead of the hard-coded 200 (= 20000 sampled users / 100), so the
# percentages stay correct if the sample size ever changes. The values below
# are unchanged.
cluster.freq['percent'] <- cluster.freq[,2]/sum(cluster.freq[,2])*100
cluster.freq
## Var1 Freq percent
## 1 1 2704 13.520
## 2 2 3757 18.785
## 3 3 348 1.740
## 4 4 1534 7.670
## 5 5 1337 6.685
## 6 6 4198 20.990
## 7 7 2134 10.670
## 8 8 1788 8.940
## 9 9 1342 6.710
## 10 10 858 4.290
# Bar chart of cluster sizes (percent), labelled above each bar.
# FIX: removed format_string='{:.1f}% ' from geom_text -- that is Python
# format syntax, not a ggplot2 parameter, and was ignored with a warning.
p <- ggplot(cluster.freq, aes(y=percent,x=as.factor(Var1),fill=as.factor(Var1))) +
geom_bar(position="stack", stat="identity") + xlab("Clusters") + ylab("Percent") +
theme(
#legend.title = element_text(color = "black", size = 16),
#legend.text = element_text(color = "black", size = 10,face="bold"),
#legend.position='top',
axis.text=element_text(size=16,face="bold")
) + geom_text(aes(label = round(percent,2), y = percent + 0.40), position = position_dodge(0.9))
p + scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
"#D3D3D3",
"#999999",
"#8DA290","#CAA7BD",
"#B4BAD4",
'#B88BAD',
"#DBC4DF",
"#D2C1CE",
"#FDF7C2"))
# Evaluating the mean/median value of the features across clusters
# Long format: cols 14:25 (engagement/count features -- presumably
# weeks_wau_txn..n_pt_* per the head() output above; TODO confirm indices)
# keyed by the cluster label in col 141.
mydata = melt(merged.df[,c(14:25,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
med_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("median"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
# Mean of each feature per cluster, one facet row per feature.
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=as.factor(cluster))) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Same view restricted to the four engagement metrics, with the custom palette.
ggplot(mean_df10[mean_df10[,2] %in% c('weeks_wau_txn','mau_txn','mau_act','months_sau'),], aes(x=as.factor(cluster), y=value,fill=as.factor(cluster))) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25) + theme(legend.position = "none",axis.text=element_text(size=16,face="bold")) +
facet_grid(variable ~ ., scales = "free_y") + scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
"#D3D3D3",
"#999999",
"#8DA290","#CAA7BD",
"#B4BAD4",
'#B88BAD',
"#DBC4DF",
"#D2C1CE",
"#FDF7C2"))
# Next feature batch: cols 21:33 plus col 139, keyed by cluster (col 141).
# NOTE(review): column meanings inferred from head() output above -- confirm
# indices against names(merged.df).
mydata = melt(merged.df[,c(21:33,139,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
med_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("median"))
# Cluster means, one facet row per feature (same pattern for each batch below).
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Next feature batch: cols 34:48.
mydata = melt(merged.df[,c(34:48,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Next feature batch: cols 56:68.
mydata = melt(merged.df[,c(56:68,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Next feature batch: cols 69:82.
mydata = melt(merged.df[,c(69:82,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# NOTE(review): this chunk has no melt() of its own -- it reuses `mydata`
# from the cols 69:82 batch above, recomputing the same means and drawing the
# same plot a second time. A melt() of the intended column range (83:95?)
# appears to be missing here.
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Next feature batch: cols 96:108.
mydata = melt(merged.df[,c(96:108,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Next feature batch: cols 109:124.
mydata = melt(merged.df[,c(109:124,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Final feature batch: cols 125:134.
mydata = melt(merged.df[,c(125:134,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>%
group_by(cluster,variable) %>%
summarise_all("mean"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))
ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) +
geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")
# Percent of SAU users within each cluster.
# NOTE(review): the first melt(table(...)) result was immediately overwritten
# by the cast() below, so it is commented out as dead code.
#sau <- melt(table(merged.df[,c(141,142)]))
# Cross-tab of cluster (col 141) by sau_cat (col 142), wide: one row per
# cluster with NON-SAU and SAU counts.
sau <- cast(melt(table(merged.df[,c(142,141)])),cluster~sau_cat)
sau['percent'] <- round(sau[,3]/(sau[,2]+sau[,3])*100,1)
#sau$cluster <- factor(sau[,1],levels=c(7,1,3,9,5,6,4,8,2,10)) # reorder factor levels
# BUG FIX: the original nested `axis.text=element_text(...) + scale_fill_manual(...)`
# inside theme(); a scale cannot be added to a theme element, so that inner
# scale_fill_manual is removed (the outer scale_fill_manual below already sets
# the palette) and the duplicated theme() settings are merged into one call.
ggplot(sau, aes(y=percent,x=as.factor(cluster),fill=as.factor(cluster))) +
theme(legend.position = "none",axis.text=element_text(size=16,face="bold")) +
geom_bar(position="stack", stat="identity") + xlab("Clusters- SAU's") + ylab("Percent SAU's") +
scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
"#D3D3D3",
"#999999",
"#8DA290","#CAA7BD",
"#B4BAD4",
'#B88BAD',
"#DBC4DF",
"#D2C1CE",
"#FDF7C2")) + geom_text(aes(label = percent, y = percent + 0.40), position = position_dodge(0.9))
# EXAMPLE OF INTERNATIONAL AND SECONDARY SPENDER OVERLAPPING CLUSTERS
# Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis so you will have to look at the data to determine the cluster behaviors.
# in this clustering example, cluster 8 = international travelers and cluster 6 = secondary spenders
# Information on cluster neighbors and silhouette widths for each user.
# NOTE(review): km.res2_8 .. km.res2_11 are presumably the clustering results
# for k = 8..11 clusters, defined earlier in the file -- confirm; silinfo$widths
# is the per-user table of (cluster, neighbor, sil_width).
silo8 <- km.res2_8$silinfo$widths
silo9 <- km.res2_9$silinfo$widths
silo10 <- km.res2_10$silinfo$widths
silo11 <- km.res2_11$silinfo$widths
# users with negative silhouette values are located on the border of a neighboring cluster
# (column 3 of the widths table is sil_width)
silo10.neg <-silo10[silo10[,3] < 0,]
head(silo10.neg) # users with negative cluster widths and corresponding neighboring clusters
## cluster neighbor sil_width
## 142442 2 1 -0.0004527271
## 7080 2 6 -0.0005589185
## 126911 2 6 -0.0008426847
## 140313 2 1 -0.0008540853
## 18539 2 9 -0.0008883909
## 86200 2 6 -0.0008939617
# example of negative width values for the international travelers cluster:
# ~65% of these users are neighbors with users from the Secondary spender cluster
# (column 1 = assigned cluster, column 2 = neighboring cluster)
table(silo10.neg[silo10.neg[,1] == 8,2]) #change cluster number accordingly
##
## 1 2 4 6 9
## 37 49 2 174 4
# total negative-width users in cluster 8 (174 of 266 neighbor cluster 6, ~65%)
length(silo10.neg[silo10.neg[,1] == 8,2])
## [1] 266
# in this clustering example, cluster 8 = international travelers and cluster 6 = secondary spenders
# Row names (user ids) of cluster-8 users whose nearest neighboring cluster is 6,
# i.e. the users sitting on the border between the two clusters.
idx <- rownames(silo10.neg[(silo10.neg[,1] == 8) & (silo10.neg[,2] == 6),])
# Join the principal components (pc) with hk.df by row name (by=0), then restore
# the user ids as row names on the merged frame.
m.pc <- transform(merge(pc,hk.df,by=0), row.names=Row.names, Row.names=NULL)
# Keep only the two clusters of interest.
# NOTE(review): column 35 of m.pc is assumed to be the cluster assignment -- confirm.
pc.overlap <- m.pc[(m.pc[,35]==8) | (m.pc[,35]==6),]
# Plot color per user; case_when evaluates in order, so border users (idx) are
# labelled first and excluded from the later cluster-wide rules.
pc.overlap$colors <- case_when(rownames(pc.overlap) %in% idx ~ "#E69F00", #overlap (yellow)
(!rownames(pc.overlap) %in% idx) & (pc.overlap$cluster == 8) ~ "#999999", #international travelers (grey)
pc.overlap$cluster == 6 ~ "#56B4E9" #secondary spenders (blue)
)
# Human-readable group label using the same three-way split as the colors above.
pc.overlap$cluster1 <- case_when(rownames(pc.overlap) %in% idx ~ "overlap", #overlap (yellow)
(!rownames(pc.overlap) %in% idx) & (pc.overlap$cluster == 8) ~ "inter_traveler", #international travelers (grey)
pc.overlap$cluster == 6 ~ "secondary_spender" #secondary spenders (blue)
)
# SCATTER PLOTS OF THE CLUSTER OVERLAP: the same 3-D view drawn with four
# axis orderings so the separation can be inspected from several orientations,
# followed by a 2-D view of the first two dimensions.
library(scatterplot3d)
overlap_legend <- function() {
  legend("topleft",
         legend = c("Inter-Travelers", "Secondary-Spenders", "Overlapping users"),
         col = c("#999999", "#56B4E9", "#E69F00"), cex = 1, pch = 6)
}
for (axes in list(c(1, 3, 2), c(2, 3, 1), c(3, 2, 1), c(1, 2, 3))) {
  scatterplot3d(pc.overlap[, axes], pch = 6, color = pc.overlap$colors)
  overlap_legend()
}
plot(pc.overlap[, 1], pc.overlap[, 2], col = pc.overlap$colors, pch = 20,
     xlab = "dim1", ylab = "dim2")
legend("topleft",
       legend = c("Inter-Travelers", "Secondary-Spenders", "Overlapping users"),
       col = c("#999999", "#56B4E9", "#E69F00"), cex = 1, pch = 20)
# Compare the three groups (inter_traveler / overlap / secondary_spender) on
# weekly activity and domestic vs. international spend, as facetted bars of
# group means with the same color coding as the scatter plots.
feature.df <- transform(merge(df, pc.overlap, by = 0),
                        row.names = Row.names, Row.names = NULL)
keep <- colnames(feature.df) %in% c("weeks_wau_txn", "pt_dom_sum",
                                    "pt_inter_sum", "cluster1", "colors")
features <- melt(feature.df[, keep], id = c("cluster1", "colors"))
mean_df10 <- features %>%
  group_by(cluster1, colors, variable) %>%
  summarise_all("mean") %>%
  as.data.frame()
ggplot(mean_df10, aes(x = as.factor(cluster1), y = value, fill = as.factor(cluster1))) +
  geom_col() +
  geom_text(aes(label = round(value)),
            position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none",
        axis.text = element_text(size = 16, face = "bold")) +
  facet_grid(variable ~ ., scales = "free_y") +
  scale_fill_manual(values = c("inter_traveler" = "#999999",
                               "overlap" = "#E69F00",
                               "secondary_spender" = "#56B4E9"))
## Results Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis, so you will have to look at the data to determine the cluster behaviors.
The yellow dots are the users with negative width values from the cluster corresponding to international travelers (grey). These users are located next to the secondary spender cluster (blue). This suggests that these users may exhibit behavior that is intermediate between the two groups. The overlapping customers use our product for both international travel and domestic spending, and seem to be more active than users who are using the app primarily for international spend.